In [1]:
import pandas
from collections import defaultdict
In [2]:
# Load the misalignment table; the file is tab-separated.
pah = pandas.read_csv('enwiki_pah_misalignment.tsv', delimiter='\t')
In [3]:
# Preview the first five rows of the misalignment table.
pah.head(n=5)
Out[3]:
In [4]:
# Show the first five rows of every dissonance group.
pah.groupby(by='dissonance').head(n=5)
Out[4]:
In [5]:
# Read the QID/English-name CSV into a list of raw lines. The context manager
# closes the file handle as soon as the lines are in memory (the bare
# open(...).readlines() form left it open).
with open('english-qid-names2016-03-27.csv', 'r') as qid_names_file:
    lines = qid_names_file.readlines()
In [6]:
# Build a lookup from QID (first CSV field) to English page name (rest of the line).
qidnames = {}
for line in lines:
    # Split on the first comma only, so any further commas stay in the name.
    # partition() also tolerates a line with no comma at all (e.g. a blank
    # trailing line): the name comes back empty and the line is skipped below,
    # where a two-target unpack of split() would raise ValueError.
    qid, sep, ennamen = line.partition(',')
    # Drop the trailing newline (and anything after it).
    enname = ennamen.split('\n')[0]
    if enname:
        qidnames[qid] = enname
In [7]:
import json
In [8]:
# Persist the QID -> English name lookup as JSON. The with-block guarantees
# the output file is flushed and closed once the dump finishes.
with open('qid_enpage.json', 'w') as qid_json_file:
    json.dump(qidnames, qid_json_file)
In [9]:
# One row per QID (the dict keys become the index), with the English name as the value.
endf = pandas.DataFrame.from_dict(data=qidnames, orient='index')
In [10]:
# Number of QIDs that have an English page name.
endf.shape[0]
Out[10]:
Todo:
+ map gender-enwiki-page-id
+ dissonance class priors
+ posteriors by gender
+ posterior for no gender.
In [11]:
# Load the gender-index snapshot (dated 2016-01-03 per its file name).
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where that volume is mounted.
bigdf = pandas.read_csv(
    filepath_or_buffer='/media/notconfusing/9d9b45fc-55f7-428c-a228-1c4c4a1b728c/home/maximilianklein/snapshot_data/2016-01-03/gender-index-data-2016-01-03.csv'
)
In [12]:
# Keep only the QID and gender columns. Take an explicit copy: later cells
# assign new/overwritten columns on gender_qid_df, and doing that on a bare
# slice of bigdf triggers pandas' SettingWithCopyWarning.
gender_qid_df = bigdf[['qid', 'gender']].copy()
In [13]:
def map_gender(x):
    """Translate a raw gender cell into a coarse label.

    Any float (which is how a missing value, NaN, arrives) becomes
    'no gender'. Otherwise the value is treated as a '|'-separated string and
    only its first entry is examined: 'Q6581072' -> 'female',
    'Q6581097' -> 'male', anything else -> 'nonbin'.
    """
    if isinstance(x, float):
        return 'no gender'
    first_qid = x.split('|')[0]
    labels_by_qid = {'Q6581072': 'female', 'Q6581097': 'male'}
    return labels_by_qid.get(first_qid, 'nonbin')
# Derive the labels from the untouched raw column in bigdf (gender_qid_df is a
# column subset of bigdf, so the indexes line up) instead of from
# gender_qid_df['gender'] itself. map_gender turns an already-mapped label such
# as 'female' into 'nonbin', so mapping the column onto itself silently
# corrupted it whenever this cell was run a second time.
gender_qid_df['gender'] = bigdf['gender'].apply(map_gender)
In [14]:
def qid2enname(x):
    """Return the English page name recorded for QID `x` in `qidnames`, or None if it has none."""
    return qidnames.get(x)
# Attach the English page name for each QID (None where the QID has no entry).
gender_qid_df['enname'] = gender_qid_df['qid'].map(qid2enname)
In [15]:
# Load the headerless, tab-separated (page name, page id) table.
# NOTE(review): absolute, machine-specific path.
enname_id = pandas.read_csv(
    '/home/notconfusing/workspace/wikidumpparse/wikidump/mediawiki-utilities/enname_id.txt',
    delimiter='\t',
    names=['enname', 'pageid'],
)
In [16]:
# Join gender rows to page ids on the English page name.
# pandas.merge matches null keys against each other, so rows whose QID has no
# English name (enname is None) would otherwise be joined to any row of
# enname_id whose name was parsed as NaN. Drop null keys on both sides first;
# genuine name matches are unaffected.
gender_page_id = pandas.merge(
    gender_qid_df.dropna(subset=['enname']),
    enname_id.dropna(subset=['enname']),
    how='inner',
    on='enname',
)
In [17]:
# Left join: keep every misalignment row, adding gender columns where the page id matches.
pah_gender = pah.merge(right=gender_page_id, how='left', on='pageid')
In [18]:
# Preview the joined table rather than dumping the whole frame into the notebook.
pah_gender.head()
Out[18]:
In [19]:
# Row counts of the misalignment table, the gender/page-id table and their left join.
tuple(map(len, (pah, gender_page_id, pah_gender)))
Out[19]:
Relative risk: P(gender | misaligned) / P(gender)
What proportion of the misaligned dataset is about women?
For each gender, what proportion of each misalignment group do they represent?
In [20]:
# Pages with no gender after the left join are labelled 'nonbio'.
pah_gender['gender'] = pah_gender['gender'].fillna(value='nonbio')
In [68]:
# Split the pages into the two misalignment groups by dissonance class:
# SE ("Spent Effort") = negative dissonance, NI ("Needs Improvement") = positive.
SE = pah_gender[pah_gender['dissonance'].isin(['Moderate negative', 'High negative'])]
NI = pah_gender[pah_gender['dissonance'].isin(['Moderate positive', 'High positive'])]

# rel_risk[gender][group name] = p(gender | misalignment group) / p(gender)
rel_risk = defaultdict(dict)
total_pages = len(pah_gender)
for risk, risk_name in [(SE, 'Spent Effort'), (NI, 'Needs Improvement')]:
    for gender in ['female', 'male', 'nonbin', 'nonbio']:
        gen_mis = len(risk[risk['gender'] == gender])
        gen_all = len(pah_gender[pah_gender['gender'] == gender])
        # An empty group, an empty table or an absent gender yields NaN
        # instead of raising ZeroDivisionError.
        p_gen_mis = gen_mis / len(risk) if len(risk) else float('nan')  # p(gender|misalignment)
        p_gen = gen_all / total_pages if total_pages else float('nan')  # p(gender)
        print(p_gen_mis, p_gen)
        rel_risk[gender][risk_name] = p_gen_mis / p_gen if p_gen else float('nan')  # relative risk
In [24]:
# Smallest 32-bit signed integer; cells holding it are read as missing values
# (presumably the exporter's "no value" sentinel - confirm).
java_min_int = -(2 ** 31)
# NOTE(review): absolute, machine-specific path.
allrecs = pandas.read_csv(
    '/media/notconfusing/9d9b45fc-55f7-428c-a228-1c4c4a1b728c/home/maximilianklein/snapshot_data/2016-01-03/gender-index-data-2016-01-03.csv',
    na_values=[java_min_int],
)
In [26]:
def sum_column(q_str):
    """Count the entries in a '|'-delimited string.

    The field format always ends with a '|', so splitting leaves a trailing
    empty piece. Empty pieces are ignored, so 'a|b|' counts as 2 rather than
    the 3 that a plain len() of the split would report.

    Returns the entry count for a str input, and None for any other type
    (e.g. the float NaN of a missing cell).
    """
    if type(q_str) is str:
        return sum(1 for piece in q_str.split('|') if piece)
    return None
# Replace each listed '|'-delimited column with its per-row count.
# Not re-runnable: a second pass sees numbers, not strings, and sum_column returns None for them.
for col in ('site_links',):
    allrecs[col] = allrecs[col].map(sum_column)
In [27]:
# Spot-check the first twenty site-link counts.
allrecs['site_links'].head(n=20)
Out[27]:
In [29]:
# Collapse the raw gender cells into coarse labels.
# Not re-runnable: map_gender turns an already-mapped 'female'/'male' into 'nonbin'.
allrecs['gender'] = allrecs['gender'].map(map_gender)
In [78]:
# sl_risk[gender]['Sitelink Ratio'] = (site links per record of that gender) / 2.6
sl_risk = defaultdict(dict)
# Non-biography rows get a fixed ratio of 1.
sl_risk['nonbio']['Sitelink Ratio'] = 1
for gender in ('female', 'male', 'nonbin'):
    of_gender = allrecs[allrecs['gender'] == gender]
    # Total site links over the number of records of this gender; records with
    # a missing count add nothing to the total but still count in the divisor.
    links_per_record = of_gender['site_links'].sum() / len(of_gender)
    # NOTE(review): 2.6 is an unexplained constant - presumably the baseline
    # (non-biography) average that the fixed ratio of 1 refers to; confirm.
    sl_risk[gender]['Sitelink Ratio'] = links_per_record / 2.6
In [79]:
# One row per gender label, one column per metric.
sl_risk_df = pandas.DataFrame.from_dict(data=sl_risk, orient='index')
In [80]:
# One row per gender label, one column per misalignment group.
rel_risk_df = pandas.DataFrame.from_dict(data=rel_risk, orient='index')
In [81]:
# Combine the site-link ratios with the relative risks, aligned on the gender index.
risk_df = sl_risk_df.join(rel_risk_df)
In [82]:
# Relabel rows by key, not by position. sl_risk is seeded with 'nonbio' before
# the other genders, so the row order of risk_df is not guaranteed to be
# female, male, nonbin, nonbio; assigning a list to .index would then attach
# the display names to the wrong rows. rename() maps each key to its label and
# reindex() fixes the presentation order. Re-running the cell is a no-op.
risk_df = (
    risk_df
    .rename(index={
        'female': 'Female',
        'male': 'Male',
        'nonbin': 'Non-binary',
        'nonbio': 'Non-biography',
    })
    .reindex(['Female', 'Male', 'Non-binary', 'Non-biography'])
)
In [83]:
# Emit the final table as LaTeX with two-decimal floats; print() keeps the
# markup raw so it can be pasted straight into a document.
latex_columns = ['Needs Improvement', 'Spent Effort', 'Sitelink Ratio']
print(risk_df.to_latex(columns=latex_columns, float_format='{:.2f}'.format))